Load raw data, annotate probes using biomaRt and load SFARI genes

# Load csvs
datExpr = read.csv('./../raw_data/RNAseq_ASD_datExpr.csv', row.names=1)
datMeta = read.csv('./../raw_data/RNAseq_ASD_datMeta.csv')
SFARI_genes = read_csv('./../working_data/SFARI_genes_01-15-2019.csv')

# Make sure datExpr and datMeta columns/rows match
rownames(datMeta) = paste0('X', datMeta$Dissected_Sample_ID)
if(!all(colnames(datExpr) == rownames(datMeta))){
  print('Columns in datExpr don\'t match the rowd in datMeta!')
}

# Annotate probes
getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
            'end_position','strand','band','gene_biotype','percentage_gc_content')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
               dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19
datProbes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)
datProbes = datProbes[match(rownames(datExpr), datProbes$ensembl_gene_id),]
datProbes$length = datProbes$end_position-datProbes$start_position

# Group brain regions by lobes
datMeta$Brain_Region = as.factor(datMeta$Region)
datMeta$Brain_lobe = 'Occipital'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA4_6', 'BA9', 'BA24', 'BA44_45')] = 'Frontal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA3_1_2_5', 'BA7')] = 'Parietal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA38', 'BA39_40', 'BA20_37', 'BA41_42_22')] = 'Temporal'
datMeta$Brain_lobe=factor(datMeta$Brain_lobe, levels=c('Frontal', 'Temporal', 'Parietal', 'Occipital'))

#################################################################################
# FILTERS:

# 1 Filter probes with start or end position missing (filter 5)
# These can be filtered without probe info, they have weird IDS that don't start with ENS
to_keep = !is.na(datProbes$length)
datProbes = datProbes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datProbes) = datProbes$ensembl_gene_id

# 2. Filter samples from ID AN03345 (filter 2)
to_keep = (datMeta$Subject_ID != 'AN03345')
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]

# 3. Filter samples with rowSums <= 40 (at least half of the samples without any expression)
to_keep = rowSums(datExpr)>40
datExpr = datExpr[to_keep,]
datProbes = datProbes[to_keep,]

#################################################################################
# Annotate SFARI genes

# Get ensemble IDS for SFARI genes
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
               dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19

gene_names = getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol'), filters=c('hgnc_symbol'), 
                   values=SFARI_genes$`gene-symbol`, mart=mart) %>% 
                   mutate('gene-symbol'=hgnc_symbol, 'ID'=as.character(ensembl_gene_id)) %>% 
                   dplyr::select('ID', 'gene-symbol')

SFARI_genes = left_join(SFARI_genes, gene_names, by='gene-symbol')

#################################################################################
# Add functional annotation to genes from GO

GO_annotations = read.csv('./../working_data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID' = as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal' = 1)


datExpr_backup = datExpr

rm(getinfo, to_keep, gene_names, mart)

Number of genes:

nrow(datExpr)
## [1] 33478

Gene count by SFARI score:

table(SFARI_genes$`gene-score`)
## 
##   1   2   3   4   5   6 
##  29  82 209 538 191  25

Gene count by brain lobe:

table(datMeta$Brain_lobe)
## 
##   Frontal  Temporal  Parietal Occipital 
##        21        20        22        23

Gene count by Diagnosis and brain lobe:

t(table(datMeta$Brain_lobe, datMeta$Diagnosis_))
##      
##       Frontal Temporal Parietal Occipital
##   ASD       8       14       14        15
##   CTL      13        6        8         8



Boxplots of difference in mean between Diagnosis by score for raw data

make_ASD_vs_CTL_df = function(datExpr, lobe){
  datMeta_lobe = datMeta %>% filter(Brain_lobe==lobe & rownames(datMeta) %in% colnames(datExpr))
  datExpr_lobe = log2(datExpr+1)
  datExpr_ASD = datExpr %>% data.frame %>% dplyr::select(which(datMeta_lobe$Diagnosis_=='ASD'))
  datExpr_CTL = datExpr %>% data.frame %>% dplyr::select(which(datMeta_lobe$Diagnosis_!='ASD'))
  
  ASD_vs_CTL = data.frame('ID'=as.character(rownames(datExpr)),
                          'mean_ASD'=rowMeans(datExpr_ASD), 'mean_CTL'=rowMeans(datExpr_CTL),
                          'sd_ASD'=apply(datExpr_ASD,1,sd), 'sd_CTL'=apply(datExpr_CTL,1,sd)) %>%
               mutate('mean_diff'=mean_ASD-mean_CTL, 'sd_diff'=sd_ASD-sd_CTL) %>%
               left_join(SFARI_genes, by='ID') %>%
               dplyr::select(ID, mean_ASD, mean_CTL, mean_diff, sd_ASD, sd_CTL, sd_diff, `gene-score`) %>%
               mutate('Neuronal'=ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Non-Neuronal'))
  
  ASD_vs_CTL_SFARI = ASD_vs_CTL %>% filter(!is.na(`gene-score`)) %>% 
                     mutate(Group = as.character(`gene-score`)) %>% dplyr::select(-c(Neuronal,`gene-score`))
  ASD_vs_CTL_Neuro = ASD_vs_CTL %>% mutate(Group = Neuronal) %>% dplyr::select(-c(Neuronal,`gene-score`))
  ASD_vs_CTL_all = ASD_vs_CTL %>% mutate(Group = 'All') %>% dplyr::select(-c(Neuronal,`gene-score`))
  
  ASD_vs_CTL_together = bind_rows(ASD_vs_CTL_SFARI, ASD_vs_CTL_Neuro, ASD_vs_CTL_all)
  
  return(ASD_vs_CTL_together)
}

p = list()
for(lobe in names(table(datMeta$Brain_lobe))){
  datExpr_lobe = datExpr %>% dplyr::select(which(datMeta$Brain_lobe==lobe))
  ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr_lobe, lobe)
  plot = ggplotly(ggplot(ASD_vs_CTL, aes(Group, abs(mean_diff), fill=Group)) + 
                  geom_boxplot() + theme_minimal() + ylim(0, 1000) +
                  scale_fill_manual(values=gg_colour_hue(9)) +
                  theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))
  p[lobe] = list(plot)
}

table(ASD_vs_CTL$Group)
## 
##            1            2            3            4            5 
##           24           60          188          433          167 
##            6          All     Neuronal Non-Neuronal 
##           24        33479         1165        32314
plotly::subplot(p[[1]], p[[2]], p[[3]], p[[4]], nrows=2)
rm(p, lobe, datExpr_lobe, plot)

Normalise data using vst

datExpr = datExpr_backup # Just in case
# datMeta_backup = datMeta

vst_norm = function(lobe){
  
  datExpr_lobe = datExpr %>% dplyr::select(which(datMeta$Brain_lobe==lobe))
  datMeta_lobe = datMeta %>% filter(Brain_lobe==lobe & rownames(datMeta) %in% colnames(datExpr))
  
  counts = as.matrix(datExpr_lobe)
  rowRanges = GRanges(datProbes$chromosome_name,
                      IRanges(datProbes$start_position, width=datProbes$length),
                      strand=datProbes$strand,
                      feature_id=datProbes$ensembl_gene_id)
  
  se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta_lobe)
  dds = DESeqDataSet(se, design =~Diagnosis_)
  
  #Estimate size factors
  dds = estimateSizeFactors(dds)
  
  vst_output = vst(dds)
  datExpr_lobe = assay(vst_output)
  
  # Filter out genes with 0 variance
  to_keep = apply(datExpr_lobe, 1, sd)>0
  print(glue(lobe,': Filtering out ', sum(!to_keep), ' probes with 0 variance, keeping ', sum(to_keep)))
  datExpr_lobe = datExpr_lobe[to_keep,]
  
  return(datExpr_lobe)
}

datExpr_postNorm = list()
for(lobe in names(table(datMeta$Brain_lobe))){
  datExpr_postNorm[[lobe]] = vst_norm(lobe)
}
## Frontal: Filtering out 13 probes with 0 variance, keeping 33465
## Temporal: Filtering out 2 probes with 0 variance, keeping 33476
## Parietal: Filtering out 5 probes with 0 variance, keeping 33473
## Occipital: Filtering out 3 probes with 0 variance, keeping 33475

Boxplots of difference in mean between Diagnosis by score for normalised data

  • Regions: Frontal, Temporal, Parietal and Occipital

  • Similar behaviour in all regions

p = list()
for(lobe in names(table(datMeta$Brain_lobe))){
  datExpr_lobe = datExpr_postNorm[[lobe]]
  ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr_lobe, lobe)
  plot = ggplotly(ggplot(ASD_vs_CTL, aes(Group, abs(mean_diff), fill=Group)) + 
                geom_boxplot() + theme_minimal() + 
                scale_fill_manual(values=gg_colour_hue(9)) +
                theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))
  p[lobe] = list(plot)
}

plotly::subplot(p[[1]], p[[2]], p[[3]], p[[4]], nrows=2)
rm(p, datExpr_lobe, ASD_vs_CTL, plot)

Calculate DE by region

Seems like the Parietal lobe is the only one with a significant number of DE genes

DE_by_region = function(datExpr, datMeta){
  
  counts = as.matrix(datExpr)
  rowRanges = GRanges(datProbes$chromosome_name,
                      IRanges(datProbes$start_position, width=datProbes$length),
                      strand=datProbes$strand,
                      feature_id=datProbes$ensembl_gene_id)
  
  se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
  ddsSE = DESeqDataSet(se, design =~Diagnosis_)
  
  dds = DESeq(ddsSE)
  DE_info = results(dds) %>% data.frame %>% rownames_to_column(var = 'ID') %>%
            mutate('logFC'=log2FoldChange, 'adj.P.Val'=padj) %>% 
            dplyr::select(ID, logFC, adj.P.Val)
  
  # mod = model.matrix(~ Diagnosis_, data=datMeta)
  # corfit = duplicateCorrelation(datExpr, mod, block=datMeta$Subject_ID)
  # lmfit = lmFit(datExpr, design=mod, correlation=corfit$consensus)
  # 
  # fit = eBayes(lmfit, trend=T, robust=T)
  # top_genes = topTable(fit, coef=2, number=nrow(datExpr))
  # DE_info = top_genes[match(rownames(datExpr), rownames(top_genes)),]  
}

DE_info_by_region = list()
DE_genes_by_region = list()
i=1
for(lobe in names(table(datMeta$Brain_lobe))){
  # datExpr_lobe = datExpr_postNorm[[lobe]] # NO! RAW COUNTS!
  datExpr_lobe = datExpr %>% dplyr::select(which(datMeta$Brain_lobe==lobe))
  datMeta_lobe = datMeta %>% filter(Brain_lobe==lobe)
  DE_info = DE_by_region(datExpr_lobe, datMeta_lobe) %>% mutate('ID'=rownames(datExpr_lobe))
  
  DE_info_by_region[[i]] = DE_info
  DE_genes_by_region[[i]] = DE_info %>% filter(adj.P.Val<0.05 & abs(logFC)>log2(1.2)) %>% dplyr::select(ID)
  i = i+1
  
  print(glue(lobe,' lobe: ', sum(DE_info$adj.P.Val<0.05 & abs(DE_info$logFC)>log2(1.2), na.rm=T),
             ' DE genes'))
}
## Frontal lobe: 168 DE genes
## Temporal lobe: 35 DE genes
## Parietal lobe: 3369 DE genes
## Occipital lobe: 89 DE genes
names(DE_info_by_region) = names(table(datMeta$Brain_lobe))
names(DE_genes_by_region) = names(table(datMeta$Brain_lobe))

Venn Diagrams of DE genes for each region

# venneuler package (areas do correspond to counts)
DE_genes_df = data.frame('Frontal' = rownames(datExpr) %in% DE_genes_by_region[['Frontal']]$ID,
                         'Temporal' = rownames(datExpr) %in% DE_genes_by_region[['Temporal']]$ID,
                         'Parietal' = rownames(datExpr) %in% DE_genes_by_region[['Parietal']]$ID,
                         'Occipital' = rownames(datExpr) %in% DE_genes_by_region[['Occipital']]$ID)
rownames(DE_genes_df) = rownames(datExpr)

plot(venneuler(DE_genes_df))

# VennDiagram package (areas do not correspond to counts)
grid.newpage()
grid.draw(draw.quad.venn(nrow(DE_genes_by_region[[1]]), nrow(DE_genes_by_region[[2]]),
                         nrow(DE_genes_by_region[[3]]), nrow(DE_genes_by_region[[4]]),
          length(intersect(DE_genes_by_region[[1]]$ID, DE_genes_by_region[[2]]$ID)),
          length(intersect(DE_genes_by_region[[1]]$ID, DE_genes_by_region[[3]]$ID)),
          length(intersect(DE_genes_by_region[[1]]$ID, DE_genes_by_region[[4]]$ID)),
          length(intersect(DE_genes_by_region[[2]]$ID, DE_genes_by_region[[3]]$ID)),
          length(intersect(DE_genes_by_region[[2]]$ID, DE_genes_by_region[[4]]$ID)),
          length(intersect(DE_genes_by_region[[3]]$ID, DE_genes_by_region[[4]]$ID)),
          length(intersect(intersect(DE_genes_by_region[[1]]$ID, DE_genes_by_region[[2]]$ID), DE_genes_by_region[[3]]$ID)),
          length(intersect(intersect(DE_genes_by_region[[1]]$ID, DE_genes_by_region[[2]]$ID), DE_genes_by_region[[4]]$ID)),
          length(intersect(intersect(DE_genes_by_region[[1]]$ID, DE_genes_by_region[[3]]$ID), DE_genes_by_region[[4]]$ID)),
          length(intersect(intersect(DE_genes_by_region[[2]]$ID, DE_genes_by_region[[3]]$ID), DE_genes_by_region[[4]]$ID)),
          length(intersect(intersect(intersect(DE_genes_by_region[[1]]$ID, DE_genes_by_region[[2]]$ID), 
                                     DE_genes_by_region[[3]]$ID), DE_genes_by_region[[4]]$ID)),
          category = names(DE_genes_by_region), fill = hcl(h=seq(15, 375, length=5), l=65, c=100)[-5],
          alpha = rep(0.25,4), lty = rep('blank', 4)))

rm(i, lobe, datExpr_lobe, datMeta_lobe, DE_info, DE_genes_df)

Perform PCA for all lobes

PC1 explains the average expression and PC2 log fold change. 1st PC still explains over 95% of the variance

reduce_dim_datExpr = function(datExpr, datMeta, var_explained=0.95){

  datExpr_pca = prcomp(log2(datExpr+1), scale=TRUE)
  last_pc = data.frame(summary(datExpr_pca)$importance[3,]) %>% rownames_to_column(var='ID') %>% 
            filter(.[[2]] >= var_explained) %>% top_n(-1, ID)
  
  print(glue('Keeping top ', substr(last_pc$ID, 3, nchar(last_pc$ID)), ' components that explain ',
             var_explained*100, '% of the variance'))
  
  datExpr_top_pc = datExpr_pca$x %>% data.frame %>% dplyr::select(PC1:last_pc$ID)
  
  return(list('datExpr'=datExpr_top_pc, 'pca_output'=datExpr_pca))
}

plt = list()
i = 1
for(lobe in names(table(datMeta$Brain_lobe))){
  
  datExpr_lobe = datExpr %>% filter(rownames(datExpr) %in% DE_genes_by_region[[lobe]]$ID) %>%
                 dplyr::select(which(datMeta$Brain_lobe==lobe))
  to_keep = apply(datExpr_lobe, 1, sd)>0
  datExpr_lobe = datExpr_lobe %>% filter(to_keep)
  rownames(datExpr_lobe) = rownames(datExpr)[rownames(datExpr) %in% DE_genes_by_region[[lobe]]$ID][to_keep]
  datMeta_lobe = datMeta %>% filter(Brain_lobe==lobe)
  red_dim = reduce_dim_datExpr(datExpr_lobe, datMeta_lobe)
  
  pca_lobe = red_dim$datExpr %>% mutate('ID' = rownames(red_dim$datExpr)) %>%
             left_join(SFARI_genes, by='ID') %>% dplyr::select(ID, PC1, PC2, `gene-score`) %>%
             mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
             left_join(DE_info_by_region[[lobe]], by='ID') %>% distinct(ID, .keep_all=TRUE) %>%
             mutate('abs_lFC'=abs(logFC), 'gene-score'=ifelse(`gene-score`=='None' & ID %in% GO_neuronal$ID, 
                    'Neuronal', `gene-score`), 'avg_expr'=rowMeans(log2(datExpr_lobe+1)))
  
  n = length(unique(pca_lobe$`gene-score`))+1
  p = ggplotly(pca_lobe %>% ggplot(aes(PC1, PC2, fill=`gene-score`, color=`gene-score`)) +
           geom_point(alpha=0.5) + theme_minimal() + ggtitle(lobe) +
           scale_fill_manual(values=gg_colour_hue(n)) +
           scale_color_manual(values=gg_colour_hue(n))) 
  
  plt[[i]] = p
  i = i+1
}
## Keeping top 10 components that explain 95% of the variance
## Keeping top 10 components that explain 95% of the variance
## Keeping top 10 components that explain 95% of the variance
## Keeping top 10 components that explain 95% of the variance
#lobe = 'Parietal'

rm(to_keep, i)
plt[[1]] ; plt[[2]] ; plt[[3]] ; plt[[4]]

Changes in PCA plots for different filtering thresholds for Parietal samples

lfc=-1 means no filtering at all, the rest of the filterings include on top of the defined lfc, an adjusted p-value lower than 0.05

lfc_list = seq(0, 3, 0.1)

plt = list()
i=1

for(lobe in names(table(datMeta$Brain_lobe))){
  datExpr_lobe = datExpr %>% dplyr::select(which(datMeta$Brain_lobe==lobe))
  to_keep = apply(datExpr_lobe, 1, sd)>0
  datExpr_lobe = datExpr_lobe %>% filter(to_keep)
  rownames(datExpr_lobe) = rownames(datExpr)[to_keep]
  datMeta_lobe = datMeta %>% filter(Brain_lobe==lobe)
  
  n_genes = nrow(datExpr_lobe)
  
  # Calculate PCAs
  datExpr_pca_samps = log2(datExpr_lobe+1) %>% data.frame %>% t %>% prcomp(scale.=TRUE)
  datExpr_pca_genes = log2(datExpr_lobe+1) %>% data.frame %>% prcomp(scale.=TRUE)
  
  # Initialice DF to save PCA outputs
  pcas_samps = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
               mutate('ID'=colnames(datExpr_lobe), 'lfc'=-1, PC1=scale(PC1), PC2=scale(PC2))
  pcas_genes = datExpr_pca_genes$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
               mutate('ID'=rownames(datExpr_lobe), 'lfc'=-1, PC1=scale(PC1), PC2=scale(PC2))
  
  pca_samps_old = pcas_samps
  pca_genes_old = pcas_genes
  
  for(lfc in lfc_list){
    
    # Filter DE genes with iteration's criteria
    DE_genes = DE_info_by_region[[lobe]] %>% filter(adj.P.Val<0.05 & abs(logFC)>lfc & ID %in% rownames(datExpr_lobe))
    datExpr_DE = datExpr_lobe %>% data.frame %>% filter(rownames(.) %in% DE_genes$ID)
    n_genes = c(n_genes, nrow(DE_genes))
    
    # Calculate PCAs
    datExpr_pca_samps = log2(datExpr_DE+1) %>% t %>% prcomp(scale.=TRUE)
    datExpr_pca_genes = log2(datExpr_DE+1) %>% prcomp(scale.=TRUE)
  
    # Create new DF entries
    pca_samps_new = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
                    mutate('ID'=colnames(datExpr_lobe), 'lfc'=lfc, PC1=scale(PC1), PC2=scale(PC2))
    pca_genes_new = datExpr_pca_genes$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
                    mutate('ID'=DE_genes$ID, 'lfc'=lfc, PC1=scale(PC1), PC2=scale(PC2))  
    
    # Change PC sign if necessary
    if(cor(pca_samps_new$PC1, pca_samps_old$PC1)<0) pca_samps_new$PC1 = -pca_samps_new$PC1
    if(cor(pca_samps_new$PC2, pca_samps_old$PC2)<0) pca_samps_new$PC2 = -pca_samps_new$PC2
    if(cor(pca_genes_new$PC1, pca_genes_old[pca_genes_old$ID %in% pca_genes_new$ID,]$PC1 )<0){
      pca_genes_new$PC1 = -pca_genes_new$PC1
    }
    if(cor(pca_genes_new$PC2, pca_genes_old[pca_genes_old$ID %in% pca_genes_new$ID,]$PC2 )<0){
      pca_genes_new$PC2 = -pca_genes_new$PC2
    }
    
    pca_samps_old = pca_samps_new
    pca_genes_old = pca_genes_new
    
    # Update DFs
    pcas_samps = rbind(pcas_samps, pca_samps_new)
    pcas_genes = rbind(pcas_genes, pca_genes_new)
    
  }
  
  # Add Diagnosis/SFARI score information
  pcas_samps = pcas_samps %>% mutate('ID'=substring(ID,2)) %>% 
               left_join(datMeta_lobe, by=c('ID'='Dissected_Sample_ID')) %>%
               dplyr::select(ID, PC1, PC2, lfc, Diagnosis_, Brain_lobe)
  pcas_genes = pcas_genes %>% left_join(SFARI_genes, by='ID') %>% 
               mutate('score'=as.factor(`gene-score`)) %>%
               dplyr::select(ID, PC1, PC2, lfc, score)
  
  #########################################################################################################
  # Plot change of number of genes
  p = ggplotly(data.frame('lfc'=lfc_list, 'n_genes'=n_genes[-1]) %>% ggplot(aes(x=lfc, y=n_genes)) + 
           geom_point() + geom_line() + theme_minimal() + 
           ggtitle(paste0(lobe, ' lobe')))
  plt[[i]] = p
  i = i+1
  
  #########################################################################################################
  # Calculate percentage of genes remaining on each lfc by each score
  pcas_genes_backup = pcas_genes
  
  pcas_genes = pcas_genes %>% mutate(score=ifelse(is.na(score) & ID %in% GO_neuronal$ID, 'Neuronal', score))
  
  score_count_by_lfc = pcas_genes %>% filter(!is.na(score)) %>% group_by(lfc, score) %>% tally %>% ungroup
  score_count_pcnt = score_count_by_lfc %>% filter(lfc==-1) %>% mutate('n_init'=n) %>%
                     dplyr::select(score, n_init) %>% right_join(score_count_by_lfc, by='score') %>%
                     mutate('pcnt'=round(n/n_init*100, 2)) %>% filter(lfc!=-1)
  
  # Complete missing entries with zeros
  complete_score_count_pcnt = expand.grid(unique(score_count_pcnt$lfc), unique(score_count_pcnt$score))
  colnames(complete_score_count_pcnt) = c('lfc', 'score')
  score_count_pcnt = full_join(score_count_pcnt, complete_score_count_pcnt, by=c('lfc','score')) %>%
                     dplyr::select(score, lfc, n, pcnt)
  score_count_pcnt[is.na(score_count_pcnt)] = 0
  
  # Join counts by score and all genes
  all_count_pcnt = pcas_genes %>% group_by(lfc) %>% tally  %>% filter(lfc!=-1) %>% 
                   mutate('pcnt'=round(n/nrow(datExpr)*100, 2), 'score'='All')
  all_m_neuro_count_pcnt = pcas_genes %>% filter(is.na(score)) %>% group_by(lfc) %>% tally %>% filter(lfc!=-1) %>% 
                   mutate('pcnt'=round(n/nrow(datExpr)*100, 2), 'score'='Non-Neuronal')
  score_count_pcnt = rbind(score_count_pcnt, all_count_pcnt, all_m_neuro_count_pcnt)
  
  p = ggplotly(score_count_pcnt %>% ggplot(aes(lfc, pcnt, color=score)) + geom_point() + geom_line() + 
           theme_minimal() + scale_colour_manual(values=gg_colour_hue(9)) +
           ggtitle('% of points left after each increase in log2 fold change'))
  plt[[i]] = p
  i = i+1
  
  #########################################################################################################
  p = ggplotly(pcas_samps %>% ggplot(aes(PC1, PC2, color=Diagnosis_)) + geom_point(aes(frame=lfc, ids=ID)) + 
           theme_minimal() + ggtitle(paste0('Samples PCA plot for ', lobe, ' lobe')))
  plt[[i]] = p
  i = i+1
}


rm(datExpr_pca_genes, datExpr_pca_samps, DE_genes, datExpr_DE, pca_genes_new, pca_samps_new, 
   pca_genes_old, pca_samps_old, lfc_list, lfc, score_count_by_lfc, complete_score_count_pcnt)
plt[[1]] ; plt[[2]]  ; plt[[3]]  ; plt[[4]]

plt[[5]] ; plt[[6]]  ; plt[[7]]  ; plt[[8]]

plt[[9]] ; plt[[10]] ; plt[[11]] ; plt[[12]]